import re
from scrapy import Spider, Request
from scrapy.loader import ItemLoader

from pythone_douban_all.items import PythoneDoubanAllItem

class douban_allSpider(Spider):
    """Crawl the Douban Top 250 list and scrape every linked movie detail page.

    ``parse`` walks the paginated list pages and schedules one request per
    movie link; ``details_parse`` turns each detail page into a
    ``PythoneDoubanAllItem``.
    """

    name = 'douban_all'
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Yield one detail-page request per listed movie, then the next list page.

        The movie title found on the list page is forwarded to
        ``details_parse`` through ``Request.meta['movie_name']``.
        """
        # Every movie entry on the list page is an <a> inside <div class="hd">.
        for anchor in response.xpath('//div[@class="hd"]/a'):
            href = anchor.xpath('@href').extract_first()
            if not href:
                # A malformed entry must not abort the rest of the page
                # (including the pagination request below).
                self.logger.warning(
                    'Skipping list entry without a link on %s', response.url)
                continue
            # Fall back to an empty title instead of raising when the first
            # <span> is missing; the detail page is still worth scraping.
            movie_name = anchor.xpath('span[1]/text()').extract_first(default='')
            yield Request(
                url=response.urljoin(href),
                meta={'movie_name': movie_name},
                callback=self.details_parse,
            )

        # Follow the "next" link, if any. urljoin resolves the href against
        # the current page URL, so query-only hrefs ("?start=25&filter="),
        # path-relative hrefs and absolute hrefs are all handled.
        next_href = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if next_href:
            yield Request(url=response.urljoin(next_href), callback=self.parse)

    def getinfobyre(self, instr, restr):
        """Return the first capture group of ``restr`` found in ``instr``.

        Args:
            instr: Text to search.
            restr: Regular expression containing at least one capture group.

        Returns:
            The text captured by group 1 of the first match, or ``""`` when
            the pattern does not match.
        """
        # re.S lets "." match newlines, so a value may span several lines.
        match = re.search(restr, instr, re.S)
        if match:
            return match.group(1)
        return ""

    def details_parse(self, response):
        """Build a ``PythoneDoubanAllItem`` from a movie detail page.

        The movie title comes from ``response.meta['movie_name']`` (set by
        ``parse``); the remaining fields are read from the page via XPath,
        or via regex over the raw HTML for the three label-prefixed fields.
        """
        dinfo = response.meta  # metadata forwarded from the list page
        iteml = ItemLoader(item=PythoneDoubanAllItem(), response=response)
        iteml.add_value('movie_name', dinfo['movie_name'])  # title
        iteml.add_xpath('movie_directors', '//a[@rel="v:directedBy"]/text()')  # directors
        iteml.add_xpath('movie_actors', '//a[@rel="v:starring"]/text()')  # cast
        iteml.add_xpath('movie_type', '//span[@property="v:genre"]/text()')  # genres
        iteml.add_xpath('movie_showdate', '//span[@property="v:initialReleaseDate"]/text()')  # release date
        iteml.add_xpath('movie_runtime', '//span[@property="v:runtime"]/text()')  # runtime

        # These three fields are located by their Chinese label in the raw
        # HTML: the text between the label's closing </span> and the next
        # <br/> is captured.
        showplace = self.getinfobyre(response.text, r'制片国家/地区:</span>(.+?)<br/>')  # production country/region
        iteml.add_value('movie_showplace', showplace)
        language = self.getinfobyre(response.text, r'语言:</span>(.+?)<br/>')  # language
        iteml.add_value('movie_language', language)
        othername = self.getinfobyre(response.text, r'又名:</span>(.+?)<br/>')  # alternative titles
        iteml.add_value('movie_othername', othername)

        iteml.add_xpath('movie_score', '//strong[@property="v:average"]/text()')  # average rating
        iteml.add_xpath('movie_vote', '//span[@property="v:votes"]/text()')  # number of votes
        return iteml.load_item()
